import warnings
warnings.filterwarnings("ignore")
def create_cv(df):
# create loocv splits
cv = KFold(n_splits=len(df), shuffle=False)
# initialize empty lists
idx_training, idx_test = [], []
# iterate over the cv splits adding them to a list
for fold in cv.split(df):
idx_training.append(fold[0]), idx_test.append(fold[1])
# return the training and test indices
return idx_training, idx_test
def loocv_shap(df,X,y,idx_training,idx_test, ag, out_dir, fname):
# initialize empty list to hold shap values from each fold
shap_values_per_fold_list = []
# class predicted from X_test
y_pred_list = []
# y_test
y_test_list = []
# predicted class probabilities
y_pred_probal = []
# iterate over folds
for train_outer_ix, test_outer_ix in zip(idx_training, idx_test):
X_train, X_test = X.iloc[train_outer_ix, :], X.iloc[test_outer_ix, :]
y_train, y_test = y.iloc[train_outer_ix], y.iloc[test_outer_ix]
# scale X_train
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_train_s_df = pd.DataFrame(X_train_s, columns= X_train.columns.values.tolist())
# scale X_test
X_test_s = scaler.transform(X_test)
X_test_s_df = pd.DataFrame(X_test_s, columns= X_test.columns.values.tolist())
# define parameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],'penalty': ['l2'],'solver': ['liblinear'],'max_iter': [100, 200, 300]}
# inner cv
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
# model
model = LogisticRegression(random_state=42)
# perform grid search
grid_search = GridSearchCV(model, param_grid=param_grid, cv=inner_cv, scoring="f1", n_jobs=-1)
# fit model
grid_search.fit(X_train_s_df.to_numpy(), y_train)
# predict
y_pred = grid_search.predict(X_test_s_df.to_numpy())
# predict probability
y_pred_proba = grid_search.predict_proba(X_test_s_df.to_numpy())
# probability of positive class
y_pred_probal.append(list(y_pred_proba[0])[1])
# append y_pred and y_test to respective lists
y_pred_list.append(y_pred.item())
y_test_list.append(y_test.item())
# SHAP KernelExplainer
explainer = shap.KernelExplainer(grid_search.predict_proba, X_train_s_df)
# SHAP values
shap_values = explainer.shap_values(X_test_s_df)
# add shap values to list
data_reshaped = shap_values[1].reshape(1,-1)
shap_values_df = pd.DataFrame(data_reshaped, columns=list(X_train_s_df.columns))
shap_values_per_fold_list.append(shap_values_df)
# join shap values together
shap_values_df_concat = pd.concat(shap_values_per_fold_list).reset_index(drop=True)
# write out mean absolute shap values, all shap values, shap values with feature values
mean_abs_values = shap_values_df_concat.abs().mean().reset_index().sort_values([0],ascending=True)
mean_abs_values.columns = ['feature', 'mean_absolute_shap_value']
mean_abs_values.to_csv(out_dir + "/" + out_dir + "_mean_absolute_shap_values.csv", index=False)
df_sub_donor = df[['Donor']]
donor_shap_values_df = pd.concat([df_sub_donor, shap_values_df_concat], axis=1)
donor_shap_values_df.to_csv(out_dir + "/" + out_dir + "_all_shap_values.csv", index=False)
donor_shap_values_df_melt = pd.melt(donor_shap_values_df,id_vars=["Donor"])
df_sub_drop = df.drop(['Antigen','response'], axis=1)
df_sub_drop_melt = pd.melt(df_sub_drop, id_vars=["Donor"])
df_merge_shap_feat_long = donor_shap_values_df_melt.merge(df_sub_drop_melt, on=['Donor','variable'], how='left')
df_merge_shap_feat_long.columns = ['Donor','variable','shap_value','cell_composition']
df_merge_shap_feat_long.to_csv(out_dir + "/" + out_dir + "_shap_feature_values_long.csv", index=False)
# scoring metrics
conf_matrix = confusion_matrix(y_test_list, y_pred_list,labels = [0, 1])
conf_matrix_df = pd.DataFrame(conf_matrix)
conf_matrix_df.to_csv(out_dir + "/" + out_dir + "_confusion_matrix.csv", index=False)
fpr, tpr, _ = roc_curve(y_test_list, np.array(y_pred_probal))
roc_curve_df = pd.DataFrame({'fpr':fpr,'tpr':tpr})
roc_curve_df['Antigen'] = ag
roc_curve_df.to_csv(out_dir + "/" + out_dir + "_roc_curve_df.csv", index=False)
classification_report_df = pd.DataFrame(classification_report(y_test_list, y_pred_list, target_names = ["class 0","class 1"], output_dict=True)).transpose()
classification_report_df.to_csv(out_dir + "/" + out_dir + "_classification_report.csv")
accuracy = accuracy_score(y_test_list, y_pred_list)
f1 = f1_score(y_test_list, y_pred_list)
bal_acc = balanced_accuracy_score(y_test_list, y_pred_list)
roc_score = roc_auc_score(y_test_list, np.array(y_pred_probal))
scoring_df = pd.DataFrame({'model':[str(model)],'accuracy':[accuracy], 'balanced_accuracy':[bal_acc],'f1_score':[f1],'roc_auc_score':[roc_score],'pr_auc':[pr_auc],'antigen':[ag], 'filename':[fname]})
scoring_df.to_csv(out_dir + "/" + out_dir + "_model_scoring.csv", index=False)
# read in dataframe with features
filename = "manual_t_b_ha_dc_cell_comp_extreme_threshold_auc_modality.csv"
df = pd.read_csv(filename)
# get a list of the antigens
ag_list = list(set(df['Antigen']))
for ag in ag_list:
# subset antigen
df_sub = df[df['Antigen'] == ag]
# reset column index
df_sub.reset_index(inplace=True,drop=True)
# select target
y = df_sub['response']
# drop non relevant columns
X = df_sub.drop(['Donor','Antigen','response'], axis=1)
# setup cv
idx_training, idx_test = create_cv(X)
# construct output directory name
out_dir = filename.replace(".csv","_") + ag
# make the directory
os.makedirs(out_dir)
# run loocv shap
loocv_shap(df_sub, X, y, idx_training, idx_test, ag, out_dir, filename)